1 Setup

suppressPackageStartupMessages({
  library(data.table)
  library(DESeq2)
  library(gplots)
  library(here)
  library(hyperSpec)
  library(parallel)
  library(pander)
  library(plotly)
  library(RColorBrewer)
  library(scatterplot3d)
  library(tidyverse)
  library(tximport)
  library(vsn)
})
source(here("UPSCb-common/src/R/plot.multidensity.R"))
source(here("UPSCb-common/src/R/featureSelection.R"))
pal <- brewer.pal(8,"Dark2")
hpal <- colorRampPalette(c("blue","white","red"))(100)
mar <- par("mar")
samples <- readr::read_csv(here("doc/Samples.csv"),
                      col_types = cols(
                        col_character(),
                        col_factor(),
                        col_factor(),
                        col_factor(),
                        col_character()
                      ))

tx2gene translation table

tx2gene <- suppressMessages(read_delim(here("reference/annotation/tx2gene.tsv.gz"),delim="\t",
                                 col_names=c("TXID","GENE")))

2 Raw data

filelist <- list.files(here("data/salmon"), 
                          recursive = TRUE, 
                          pattern = "quant.sf",
                          full.names = TRUE)

Sanity check to ensure that the data is sorted according to the sample info

samples <- samples[match(sub("_sortmerna_trimmomatic", "", basename(dirname(filelist))),
                          samples$sampleID),]
stopifnot(all(match(sub("_sortmerna_trimmomatic", "", basename(dirname(filelist))),
                    samples$sampleID) == 1:nrow(samples)))

name the file list vector

names(filelist) <- samples$Name

Read the expression at the gene level

counts <- suppressMessages(round(tximport(files = filelist, 
                                  type = "salmon",
                                  tx2gene = tx2gene)$counts))
counts <- subset(counts, select = -c(X_S339, P_S339))


samples <- samples[-c(8, 18),]

2.1 Quality Control

  • Check how many genes are never expressed
sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
        round(sum(sel) * 100/ nrow(counts), digits = 1),
        sum(sel),
        nrow(counts))
## [1] "25.3% percent (9378) of 37075 genes are not expressed"
  • Let us take a look at the sequencing depth, colouring by CHANGEME
dat <- tibble(x = colnames(counts), y = colSums(counts)) %>% 
  bind_cols(samples)

ggplot(dat, aes(x, y, fill = samples$Flexure)) + geom_col() + 
  scale_y_continuous(name = "reads") +
  theme(axis.text.x = element_text(angle = 90, size = 4), axis.title.x = element_blank())

  • Display the per-gene mean expression

i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.

The cumulative gene coverage is as expected

ggplot(data.frame(value = log10(rowMeans(counts))), aes(x = value)) + 
  geom_density() + ggtitle("gene mean raw counts distribution") +
  scale_x_continuous(name = "mean raw counts (log10)")
## Warning: Removed 9378 rows containing non-finite values (stat_density).

The same is done for the individual samples.

dat <- as.data.frame(log10(counts)) %>% utils::stack() %>% 
  mutate(Condition = samples$Flexure[match(ind, samples$Name)]) %>% 
  mutate(Tissue = samples$Tissue[match(ind, samples$Name)])

ggplot(dat, aes(x = values, group = ind, col = Tissue)) + 
  geom_density() + ggtitle("sample raw counts distribution") +
  scale_x_continuous(name ="per gene raw counts (log10)")
## Warning: Removed 240818 rows containing non-finite values (stat_density).

2.2 Export

dir.create(here("data/analysis/salmon"), showWarnings = FALSE, recursive = TRUE)
write.csv(counts, file = here("data/analysis/salmon/raw-unormalised-gene-expression_data-S339.csv"))

3 Data normalisation

3.1 Preparation

For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate.

dds <- DESeqDataSetFromMatrix(
  countData = counts,
  colData = samples,
  design = ~ TW + Flexure)
## converting counts to integer mode
colnames(dds) <- as.character(colData(dds)$Name)
saveRDS(dds, file = here("data/analysis/salmon/dds-S339.rds"))
save(dds, file = here("data/analysis/salmon/dds-S339.rda"))

Check the size factors (i.e. the sequencing library size effect)

dds <- estimateSizeFactors(dds)
sizes <- sizeFactors(dds)
pander(sizes)
Table continues below
P_R327 P_R301 P_R93 P_R275 P_R249 P_S131 P_S235 P_S105 P_S287
1.21 1.148 1.227 1.238 1.259 1.151 1.29 1.261 1.281
X_R249 X_R327 X_R301 X_R93 X_R275 X_S131 X_S235 X_S105 X_S287
0.8302 0.7911 0.7843 0.7906 0.8315 0.9091 0.893 0.8527 0.7554
boxplot(sizes, main = "Sequencing libraries size factor")

3.2 Variance Stabilising Transformation

vsd <- varianceStabilizingTransformation(dds, blind = TRUE)
vst <- assay(vsd)
vst <- vst - min(vst)
saveRDS(vst, file = here("data/analysis/salmon/vst-S339.rds"))
write.csv(vst, file = here("data/analysis/salmon/normalised-gene-expression_data-S339.csv"))
  • Validation

The variance stabilisation worked adequately

meanSdPlot(vst[rowSums(vst) > 0, ])

3.3 QC on the normalised data

3.3.1 PCA

pc <- prcomp(t(vst))
percent <- round(summary(pc)$importance[2, ]*100)
  • Cumulative components effect

We define the number of variable of the model

nvar = 2

An the number of possible combinations

nlevel = nlevels(dds$Tissue) * nlevels(dds$Flexure) # 4 levels

We plot the percentage explained by the different components, the red line represent the number of variable in the model, the orange line the number of variable combinations.

ggplot(tibble(x = 1:length(percent), y = cumsum(percent)), aes(x = x, y = y)) +
  geom_line() + scale_y_continuous("variance explained (%)", limits = c(0, 100)) +
  scale_x_continuous("Principal component") + 
  geom_vline(xintercept = nvar, colour = "red", linetype = "dashed", size = 0.5) + 
  geom_hline(yintercept = cumsum(percent)[nvar], colour = "red", linetype = "dashed", size = 0.5) +
  geom_vline(xintercept = nlevel, colour = "orange", linetype = "dashed", size = 0.5) + 
  geom_hline(yintercept = cumsum(percent)[nlevel], colour = "orange", linetype = "dashed", size = 0.5)
## Warning: Removed 5 row(s) containing missing values (geom_path).

3.3.2 3 first dimensions

mar = c(5.1, 4.1, 4.1, 2.1)

The PCA shows that a large fraction of the variance is explained by both variables.

scatterplot3d(pc$x[, 1],
              pc$x[, 2],
              pc$x[, 3],
              xlab = paste("Comp. 1 (", percent[1], "%)", sep = ""),
              ylab = paste("Comp. 2 (", percent[2], "%)", sep = ""),
              zlab = paste("Comp. 3 (", percent[3], "%)", sep = ""),
              color = pal[as.integer(dds$Tissue)],
              pch = c(17:19)[as.integer(dds$Flexure)])
legend("topleft",
       fill = pal[1:nlevels(dds$Tissue)],
       legend = levels(dds$Tissue))

legend("topright",
       pch = 17:19,
       legend = levels(dds$Flexure))

par(mar = mar)

3.3.3 2D

pc.dat <- bind_cols(PC1 = pc$x[, 1],
                    PC2 = pc$x[, 2],
                    samples)

p <- ggplot(pc.dat, aes(x = PC1, y = PC2, col = Tissue, shape = Flexure, text = Name)) + 
  geom_point(size = 2) + 
  ggtitle("Principal Component Analysis", subtitle = "variance stabilized counts")

ggplotly(p) %>% 
  layout(xaxis = list(title = paste("PC1 (", percent[1], "%)", sep = "")),
         yaxis = list(title = paste("PC2 (", percent[2], "%)", sep = "")))
# add replicate number, plot number instead of shapes

3.3.4 Heatmap

Filter for noise

conds <- factor(paste(samples$Flexure))
sels <- rangeFeatureSelect(counts = vst,
                           conditions = conds,
                           nrep = 5)

vst.cutoff <- 2
  • Heatmap of “all” genes (Added different margins and turned the Col labels 45 degrees)
par(mar=c(7, 4, 4, 2)+0.1)
hm <- heatmap.2(t(scale(t(vst[sels[[vst.cutoff + 1]], ]))),
          distfun = pearson.dist,
          hclustfun = function(X){hclust(X, method = "ward.D2")},
          labRow = NA, trace = "none",
          labCol = conds,
          margins = c(12, 8),
          srtCol = 45,
          col = hpal)

plot(as.hclust(hm$colDendrogram), xlab = "", sub = "")

4 Session Info

## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.6 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
##  [1] parallel  grid      stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] vsn_3.62.0                  tximport_1.22.0            
##  [3] forcats_0.5.1               stringr_1.4.0              
##  [5] dplyr_1.0.8                 purrr_0.3.4                
##  [7] readr_2.1.2                 tidyr_1.2.0                
##  [9] tibble_3.1.6                tidyverse_1.3.1            
## [11] scatterplot3d_0.3-41        RColorBrewer_1.1-2         
## [13] plotly_4.10.0               pander_0.6.4               
## [15] hyperSpec_0.100.0           xml2_1.3.3                 
## [17] ggplot2_3.3.5               lattice_0.20-45            
## [19] here_1.0.1                  gplots_3.1.1               
## [21] DESeq2_1.34.0               SummarizedExperiment_1.24.0
## [23] Biobase_2.54.0              MatrixGenerics_1.6.0       
## [25] matrixStats_0.61.0          GenomicRanges_1.46.1       
## [27] GenomeInfoDb_1.30.1         IRanges_2.28.0             
## [29] S4Vectors_0.32.3            BiocGenerics_0.40.0        
## [31] data.table_1.14.2          
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1           backports_1.4.1        lazyeval_0.2.2        
##   [4] splines_4.1.2          crosstalk_1.2.0        BiocParallel_1.28.3   
##   [7] digest_0.6.29          htmltools_0.5.2        fansi_1.0.2           
##  [10] magrittr_2.0.2         memoise_2.0.1          tzdb_0.2.0            
##  [13] limma_3.50.1           Biostrings_2.62.0      annotate_1.72.0       
##  [16] modelr_0.1.8           vroom_1.5.7            jpeg_0.1-9            
##  [19] colorspace_2.0-3       blob_1.2.2             rvest_1.0.2           
##  [22] haven_2.4.3            xfun_0.29              hexbin_1.28.2         
##  [25] crayon_1.5.0           RCurl_1.98-1.6         jsonlite_1.7.3        
##  [28] genefilter_1.76.0      survival_3.2-13        glue_1.6.1            
##  [31] gtable_0.3.0           zlibbioc_1.40.0        XVector_0.34.0        
##  [34] DelayedArray_0.20.0    scales_1.1.1           DBI_1.1.2             
##  [37] Rcpp_1.0.8             viridisLite_0.4.0      xtable_1.8-4          
##  [40] bit_4.0.4              preprocessCore_1.56.0  htmlwidgets_1.5.4     
##  [43] httr_1.4.2             ellipsis_0.3.2         pkgconfig_2.0.3       
##  [46] XML_3.99-0.9           farver_2.1.0           sass_0.4.0            
##  [49] dbplyr_2.1.1           locfit_1.5-9.4         utf8_1.2.2            
##  [52] tidyselect_1.1.1       labeling_0.4.2         rlang_1.0.0           
##  [55] AnnotationDbi_1.56.2   munsell_0.5.0          cellranger_1.1.0      
##  [58] tools_4.1.2            cachem_1.0.6           cli_3.2.0             
##  [61] generics_0.1.2         RSQLite_2.2.10         broom_0.7.12          
##  [64] evaluate_0.15          fastmap_1.1.0          yaml_2.3.5            
##  [67] knitr_1.37             bit64_4.0.5            fs_1.5.2              
##  [70] caTools_1.18.2         KEGGREST_1.34.0        brio_1.1.3            
##  [73] compiler_4.1.2         rstudioapi_0.13        png_0.1-7             
##  [76] testthat_3.1.2         affyio_1.64.0          reprex_2.0.1          
##  [79] geneplotter_1.72.0     bslib_0.3.1            stringi_1.7.6         
##  [82] highr_0.9              Matrix_1.3-4           vctrs_0.3.8           
##  [85] pillar_1.7.0           lifecycle_1.0.1        BiocManager_1.30.16   
##  [88] jquerylib_0.1.4        bitops_1.0-7           R6_2.5.1              
##  [91] latticeExtra_0.6-29    affy_1.72.0            KernSmooth_2.23-20    
##  [94] gtools_3.9.2           assertthat_0.2.1       rprojroot_2.0.2       
##  [97] withr_2.4.3            GenomeInfoDbData_1.2.7 hms_1.1.1             
## [100] rmarkdown_2.12         lubridate_1.8.0